from __future__ import print_function
import os.path
import pandas as pd
import sys
sys.path.insert(0, '../../')
import seaborn as sns
import numpy as np
from JKBio.Helper import *
from JKBio.helper import pyDESeq2
from bokeh.plotting import *
from bokeh.models import HoverTool
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.manifold import MDS, TSNE
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
#from umap import UMAP
output_notebook()
%load_ext autoreload
%matplotlib inline
%autoreload 2
%load_ext rpy2.ipython
downloading their data
version="v2"
project="slamseqMax"
location= '../data/'+project+"/"
mkdir ../$location
!gsutil ls gs://transfer-amlproject/200827_MP8179_fastq/
! gsutil -m cp gs://transfer-amlproject/slamseq_inhibitor_spikeins/*MP8179*.fastq.gz ../../data/$project/fastqs/
ls ../../data/$project/fastqs
#if need to download the data from sra
! mkdir ../../data/slam/MYCpaper && cd ../../data/slam/MYCpaper && fastq_dump SRR5806781 && fastq_dump SRR5806783 && fastq_dump SRR5806785 &&
fastq_dump SRR5806780 && fastq_dump SRR5806782 && fastq_dump SRR5806784 && cd -
folder_bams = "../../data/"+project+'/fastqs'
bams = ! ls $folder_bams/*
bams
rename = {
"20200707_1_MP8000_S65": "mr193-MV411-MYBi_30m-r1",
"20200707_2_MP8000_S66": "mr194-MV411-MYBi_30m-r2",
"20200707_3_MP8000_S67": "mr195-MV411-MYBi_30m-r3",
"20200707_4_MP8000_S68": "mr196-MV411-MYBi_30m-r4",
"20200707_5_MP8000_S69": "mr197-MV411-PBS_30m-r1",
"20200707_6_MP8000_S70": "mr198-MV411-PBS_30m-r2",
"20200707_7_MP8000_S71": "mr199-MV411-PBS_30m-r3",
"20200707_8_MP8000_S72": "mr200-MV411-PBS_30m-r4",
"20200707_9_MP8000_S73": "mr201-MV411-MYBi_6h-r1",
"20200707_10_MP8000_S74": "mr202-MV411-MYBi_6h-r2",
"20200707_11_MP8000_S75": "mr203-MV411-MYBi_6h-r3",
"20200707_12_MP8000_S76": "mr204-MV411-MYBi_6h-r4",
"20200707_13_MP8000_S77": "mr205-MV411-PBS_6h-r1",
"20200707_14_MP8000_S78": "mr206-MV411-PBS_6h-r2",
"20200707_15_MP8000_S79": "mr207-MV411-PBS_6h-r3",
"20200703_16_MP800_S103": "mr208-MV411-MS2-r1",
"20200703_17_MP800_S104": "mr209-MV411-MS2-r2",
"20200703_18_MP800_S105": "mr210-MV411-MS2-r3",
"20200703_19_MP800_S106": "mr211-MV411-JQ1-r1",
"20200703_20_MP800_S107": "mr212-MV411-JQ1-r2",
"20200703_21_MP800_S108": "mr213-MV411-JQ1-r3",
"20200703_22_MP800_S109": "mr214-MV411-MS2_JQ1-r1",
"20200703_23_MP800_S110": "mr215-MV411-MS2_JQ1-r2",
"20200703_24_MP800_S111": "mr216-MV411-MS2_JQ1-r3",
"20200703_25_MP800_S112": "mr217-MV411-DMSO-r1",
"20200703_26_MP800_S113": "mr218-MV411-DMSO-r2",
"20200703_27_MP800_S114": "mr219-MV411-DMSO-r3",
"20200827_1_MP8179_S160": "mr252-MV411_MEF2D-DMSO_2h-r1",
"20200827_2_MP8179_S161": "mr253-MV411_MEF2D-DMSO_2h-r2",
"20200827_3_MP8179_S162": "mr254-MV411_MEF2D-DMSO_2h-r3",
"20200827_4_MP8179_S163": "mr255-MV411_MEF2D-DMSO_2h-r4",
"20200827_5_MP8179_S164": "mr256-MV411_MEF2D-VHL_2h-r1",
"20200827_6_MP8179_S165": "mr257-MV411_MEF2D-VHL_2h-r2",
"20200827_7_MP8179_S166": "mr258-MV411_MEF2D-VHL_2h-r3",
"20200827_8_MP8179_S167": "mr259-MV411_MEF2D-VHL_2h-r4",
"20200827_9_MP8179_S168": "mr260-MV411_MEF2D-DMSO_24h-r1",
"20200827_10_MP8179_S169": "mr261-MV411_MEF2D-DMSO_24h-r2",
"20200827_11_MP8179_S170": "mr262-MV411_MEF2D-DMSO_24h-r3",
"20200827_12_MP8179_S171": "mr263-MV411_MEF2D-DMSO_24h-r4",
"20200827_13_MP8179_S172": "mr264-MV411_MEF2D-VHL_24h-r1",
"20200827_14_MP8179_S173": "mr265-MV411_MEF2D-VHL_24h-r2",
"20200827_15_MP8179_S174": "mr266-MV411_MEF2D-VHL_24h-r3",
"20200827_16_MP8179_S175": "mr267-MV411_MEF2D-VHL_24h-r4",
"20200827_17_MP8179_S176": "mr268-MV411-PBS_24h-r1",
"20200827_18_MP8179_S177": "mr269-MV411-PBS_24h-r2",
"20200827_19_MP8179_S178": "mr270-MV411-PBS_24h-r3",
"20200827_20_MP8179_S179": "mr271-MV411-PBS_24h-r4",
"20200827_21_MP8179_S180": "mr272-MV411-MYBi_24h-r1",
"20200827_23_MP8179_S181": "mr273-MV411-MYBi_24h-r2",
"20200827_24_MP8179_S182": "mr274-MV411-MYBi_24h-r3",
}
for val in bams:
ren = val
for old, new in rename.items():
ren = ren.replace(old, new)
if ren !=val:
! mv $val $ren
folder="../../data/"+project
folder_bams = folder+'/fastqs/'
bams = ! ls $folder_bams/*.fastq.gz
bams
! pip3 install git+https://github.com/jkobject/slamdunk.git --upgrade
# please also install trimgalore and cutadapt
parrun(['../../TrimGalore-0.6.5/trim_galore --paired --cores 4 --retain_unpaired -stringency 3 --illumina '+val1+ ' '+val2+' -o '+folder_bams for val1,val2 in grouped(bams,2)],2)
rm fastqs/*.fastq.gz
ls -al ../../data/$project/fastqs
# using an ERCC ref genome (you can just append ERCC fasta to the hg38 fasta)
refgenome="../../data/ref/Homo_sapiens_assembly38_ERCC92.fasta"
bams
# we are doing it paired end
parrun(['slamdunk all -r '+refgenome+' -b ../data/Muhar_Slamseq/GSE100708_hg38_refseq_062016_ensemblv84_3UTR.bed \
-o ../../data/'+project+'/res/ -t 8 -c 2 -N '+val1.split('/')[-1].split('_R')[0]+' '+val1.replace('.fastq.gz','_val_1.fq.gz')+" "+val2.replace('.fastq.gz','_val_2.fq.gz') for val1,val2 in grouped(bams[4:],2)], 2)
ls ../../data/$project/res/count/*.tsv
!gsutil -m cp -r ../../data/$project/* gs://amlproject/RNA/slamseq_iBet_max/
counts = {}
folder = "../../data/slamseqMax/res/filter"
files = ! ls $folder/mr*
files = [file.split('/')[-1] for file in files if file.endswith(".bam")]
files
parrun(["samtools view -hb "+folder+"/"+f+" ERCC-00002 ERCC-00003 ERCC-00004 ERCC-00009 ERCC-00012 ERCC-00013 ERCC-00014 ERCC-00016 ERCC-00017 ERCC-00019 ERCC-00022 ERCC-00024 ERCC-00025 ERCC-00028 ERCC-00031 ERCC-00033 ERCC-00034 ERCC-00035 ERCC-00039 ERCC-00040 ERCC-00041 ERCC-00042 ERCC-00043 ERCC-00044 ERCC-00046 ERCC-00048 ERCC-00051 ERCC-00053 ERCC-00054 ERCC-00057 ERCC-00058 ERCC-00059 ERCC-00060 ERCC-00061 ERCC-00062 ERCC-00067 ERCC-00069 ERCC-00071 ERCC-00073 ERCC-00074 ERCC-00075 ERCC-00076 ERCC-00077 ERCC-00078 ERCC-00079 ERCC-00081 ERCC-00083 ERCC-00084 ERCC-00085 ERCC-00086 ERCC-00092 ERCC-00095 ERCC-00096 ERCC-00097 ERCC-00098 ERCC-00099 ERCC-00104 ERCC-00108 ERCC-00109 ERCC-00111 ERCC-00112 ERCC-00113 ERCC-00116 ERCC-00117 ERCC-00120 ERCC-00123 ERCC-00126 ERCC-00130 ERCC-00131 ERCC-00134 ERCC-00136 ERCC-00137 ERCC-00138 ERCC-00142 ERCC-00143 ERCC-00144 ERCC-00145 ERCC-00147 ERCC-00148 ERCC-00150 ERCC-00154 ERCC-00156 ERCC-00157 ERCC-00158 ERCC-00160 ERCC-00162 ERCC-00163 ERCC-00164 ERCC-00165 ERCC-00168 ERCC-00170 ERCC-00171 > "+folder+"/ERCC_"+f for f in files],cores=10)
parrun(["bedtools genomecov -ibam "+folder+"/ERCC_"+f+" > "+folder+"/"+f+".bed" for f in files],cores=10)
ERCC = ["ERCC-00002", "ERCC-00003", "ERCC-00004", "ERCC-00009", "ERCC-00012", "ERCC-00013", "ERCC-00014", "ERCC-00016", "ERCC-00017", "ERCC-00019", "ERCC-00022", "ERCC-00024", "ERCC-00025", "ERCC-00028", "ERCC-00031", "ERCC-00033", "ERCC-00034", "ERCC-00035", "ERCC-00039", "ERCC-00040", "ERCC-00041", "ERCC-00042", "ERCC-00043", "ERCC-00044", "ERCC-00046", "ERCC-00048", "ERCC-00051", "ERCC-00053", "ERCC-00054", "ERCC-00057", "ERCC-00058", "ERCC-00059", "ERCC-00060", "ERCC-00061", "ERCC-00062", "ERCC-00067", "ERCC-00069", "ERCC-00071", "ERCC-00073", "ERCC-00074", "ERCC-00075", "ERCC-00076", "ERCC-00077", "ERCC-00078", "ERCC-00079", "ERCC-00081", "ERCC-00083", "ERCC-00084", "ERCC-00085", "ERCC-00086", "ERCC-00092", "ERCC-00095", "ERCC-00096", "ERCC-00097", "ERCC-00098", "ERCC-00099", "ERCC-00104", "ERCC-00108", "ERCC-00109", "ERCC-00111", "ERCC-00112", "ERCC-00113", "ERCC-00116", "ERCC-00117", "ERCC-00120", "ERCC-00123", "ERCC-00126", "ERCC-00130", "ERCC-00131", "ERCC-00134", "ERCC-00136", "ERCC-00137", "ERCC-00138", "ERCC-00142", "ERCC-00143", "ERCC-00144", "ERCC-00145", "ERCC-00147", "ERCC-00148", "ERCC-00150", "ERCC-00154", "ERCC-00156", "ERCC-00157", "ERCC-00158", "ERCC-00160", "ERCC-00162", "ERCC-00163", "ERCC-00164", "ERCC-00165", "ERCC-00168", "ERCC-00170", "ERCC-00171"]
res = {i:[] for i in files}
for val in files:
cov = pd.read_csv(folder+"/"+val+".bed",sep="\t",header=None)
for i in ERCC:
res[val].append(cov[cov[0]==i][1].mean())
df = pd.DataFrame(data=res,index=ERCC)
ls ../../data/slamseqMax/res/filter/mr*.bam
totalcounts = ! for unkn in $(ls ../../data/slamseqMax/res/filter/mr*.bam); do samtools view -c -F 260 $unkn; done
totalcounts
totalcounts= [
52834953,
48661773,
99303819,
45979870,
71673717,
90428614,
51950460,
75989518,
41520582,
44914503,
34919910,
34840133,
44955360,
50106552,
47137700,
148549270,
103180767,
160756237,
128740743,
109673222,
130928802,
147257131,
149990181,
155634820,
116426167,
124950994,
86078120,
87120807,
83131772,
169547340,
162737874,
60636179,
101791667,
149282005,
130879596,
204252217,
74266908,
124166412,
109829880,
79024803,
126029299,
93542923,
107586723,
85905103,
31694204,
128780021,
37635249,
36638284,
75047033,
83424257]
res = 10000*df.mean()/totalcounts[-23:]
res
prev=0
r={}
for val in [4,4,4,3,3,3,3,3,4,4,4,4,4,3][-6:]:
r[res.keys()[prev].split('-')[2]]=[np.mean(res[prev:prev+val]), np.var(res[prev:prev+val])**(1/2)]
prev+=val
r
d= pd.DataFrame(data=r.values(),index=r.keys(), columns=['ERCC pseudo-counts','var'])
d['Experiments']=d.index
sns.barplot("Experiments","ERCC pseudo-counts",data=d,ci=None,)
plt.errorbar(x=range(0,len(d)),y=d['ERCC pseudo-counts'],
yerr=d['var'], fmt='none', c= 'r')
plt.xticks(rotation=60,ha='right')
plt.savefig('../results/'+project+"/plots/"+version+"_scaling_fact_with_conf.pdf")
location= '../../data/'+project+'/res/count/'
mincount_toremove=5
minvar_toremove=0
readcounts, tccounts = readFromSlamdunk(loc=location,minvar_toremove=minvar_toremove, mincount_toremove=mincount_toremove)
All SLAM-seq assays were performed at 60-70% confluency for adherent cells or 60% of the maximum cell density counted on a hemocytometer for suspension cells. 5-7h prior to each assay, growth medium was aspirated and replaced. Unless stated otherwise, cells were pre-treated with indicated small molecule inhibitors or 100µM IAA for 30 min to pre-establish full target inhibition or degradation. Newly synthesized RNA was labeled for indicated time spans (45 min or 60 min) at a final concentration of 100µM 4- thiouridine (4sU, Carbosynth). Adherent cells were harvested by direct snap-freezing of plates on dry ice. Suspension cells were spun down and immediately snap-frozen. RNA extraction was performed using the RNeasy Plus Mini Kit (Qiagen).
Total RNA was subjected to alkylation by iodoacetamide (Sigma, 10mM) for 15 min and RNA was repurified by ethanol precipitation. 500ng alkylated RNA were used as input for generating 3’-end mRNA sequencing libraries using a commercially available kit (QuantSeq 3′ mRNA-Seq Library Prep Kit FWD for Illumina and PCR Add-on Kit for Illumina, Lexogen). Deep sequencing was performed using HiSeq1500 and HiSeq2500 platforms (Illumina).
0.06724463 0.04916348 0.03728357 0.05075007 0.03331248 0.0346662 0.03265504 0.04416262 0.05574821 0.05845953 0.05531433 0.06175393 0.03181749 0.02940226 0.0306263 0.04004278
mkdir ../results/$project
pd.DataFrame(tccounts.values.astype(float)/readcounts.mean().values*100, columns=tccounts.columns, index=tccounts.index).to_csv('../results/' + project + '/'+version + '_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_ratio.csv')
col = tccounts.columns.tolist()
col.sort()
tccounts = tccounts[col]
col = readcounts.columns.tolist()
col.sort()
readcounts = readcounts[col]
readcounts
readcounts.to_csv('../results/'+project+'/'+version+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_readcounts.csv',index=False)
tccounts.to_csv('../results/'+project+'/'+version+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_tccounts.csv',index=False)
readcounts = pd.read_csv('../results/'+project+'/'+version+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_readcounts.csv')
tccounts = pd.read_csv('../results/'+project+'/'+version+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_tccounts.csv')
mtccounts = pd.DataFrame()
mreadcounts = pd.DataFrame()
for i in set([i.split('-')[2] for i in tccounts.columns]):
mtccounts[i] = tccounts[[v for v in tccounts.columns if i in v]].mean(1)
mreadcounts[i] = readcounts[[v for v in readcounts.columns if i in v]].mean(1)
mtccounts.to_csv("../results/"+project+'/'+version+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+"_mean_tccounts.csv")
mreadcounts.to_csv("../results/"+project+'/'+version+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+"_mean_readcounts.csv")
mtccounts = pd.DataFrame()
mreadcounts = pd.DataFrame()
for i in set([i.split('-')[2] for i in tccounts.columns]):
mtccounts[i] = tccounts[[v for v in tccounts.columns if i in v]].median(1)
mreadcounts[i] = readcounts[[v for v in readcounts.columns if i in v]].median(1)
mtccounts.to_csv("../results/"+project+'/'+version+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+"_median_tccounts.csv")
mreadcounts.to_csv("../results/"+project+'/'+version+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+"_median_readcounts.csv")
(tccounts/readcounts).fillna(0).to_csv("../results/"+project+'/'+version+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+"_tccounts_per_readcounts.csv")
we have some outliers, else it seems to make some sense and on average, to be extremelly similar!
tccounts.columns = [i.replace('-','.') for i in tccounts.columns]
readcounts.columns = [i.replace('-','.') for i in readcounts.columns]
mkdir ../results/$project/plots/
tccounts
(tccounts.loc["MYC"]/tccounts.mean(0)) /(readcounts.loc["MYC"]/readcounts.mean(0))
%matplotlib inline
sns.heatmap(tccounts.corr(),
xticklabels=tccounts.columns,
yticklabels=tccounts.columns)
plt.savefig('../results/'+project+'/plots/'+version+'_similarity_replicates_sorted_readcounts.pdf')
%matplotlib inline
sns.heatmap(readcounts.corr(),
xticklabels=readcounts.columns,
yticklabels=readcounts.columns)
plt.savefig('../results/'+project+'/plots/'+version+'_similarity_replicates.pdf')
%matplotlib inline
sns.heatmap(readcounts[readcounts.columns[:16]].corr(),
xticklabels=readcounts.columns[:16],
yticklabels=readcounts.columns[:16])
plt.savefig('../results/'+project+'/plots/'+version+'_similarity_replicates_MYBi.pdf')
The change with DMSO (PBS) 30m vs 6h is as striking to as the change with MYBi highlighting the impact of batches
ctf=pd.read_csv('../data/CTF.csv',header=None)[0].values.tolist()
ctf
set(ctf) & set(tccounts.index)
ctfpos = [val for val in tccounts.index if val in ctf]
notctfpos = [val for val in tccounts.index if val not in ctf]
ctfpos.extend(['EGR1','SERTAD1'])
We find a CTF not in the dataset
how core transcription factors change accross time when the cell is with JQ1 conditions
we are looking at the difference between production of each CTF compared to the average RNA production with JQ1 and without JQ1
readcounts["genes"] = readcounts.index
readcounts = readcounts.reset_index(drop=True)
tccounts["genes"] = tccounts.index
tccounts = tccounts.reset_index(drop=True)
tccountsMEF2D= tccounts[tccounts.columns[:16]]
designMEF2D2 = pd.DataFrame(index= [i.replace('-','.') for i in tccountsMEF2D.columns],
columns=['DMSO','VHL'],
data=np.array([[1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0],
[0,0,0,0,1,1,1,1,0,0,0,0,0,0,0,0]],dtype=bool).T)
designMEF2D24 = pd.DataFrame(index=[i.replace('-','.') for i in tccountsMEF2D.columns],
columns=['DMSO','VHL'],
data=np.array([[0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0],
[0,0,0,0,0,0,0,0,0,0,0,0,1,1,1,1]],dtype=bool).T)
tccountsMybi[tccountsMybi.columns[np.array(Mybi+[1],np.bool)]]
### MYCi
tccountsMybi= tccounts[tccounts.columns[16:-1]]
designMybi24 = pd.DataFrame(index=[i.replace('-','.') for i in tccountsMybi.columns],
columns=['DMSO','VHL'],
data=np.array([[1,1,1,1,0,0,0],
[0,0,0,0,1,1,1]],dtype=bool).T)
For gene-level analysis, raw reads mapped to different UTR annotations of the same gene were summed up by Entrez Gene ID. Pilot studies of K562 cells with kinase inhibitors were performed as single experiments.
Analysis of differential gene expression was restricted to genes with ≥ 10 reads in at least one condition for 50bp sequencing runs (flavopiridol and DMSO) or ≥ 20 reads in at least one condition for 100bp sequencing runs (mk2206, trametinib, nilotinib, trametinib + mk2206 and DMSO). For estimating differential expression, a pseudo-count of 1 raw read was added to all genes.
Differential gene expression calling was performed on raw read counts with ≥ 2 T>C conversions using DESeq2 (version 1.14.1) with default settings, and with size factors estimated on corresponding total mRNA reads for global normalization.
Downstream analysis was restricted to genes passing all internal filters for FDR estimation by DESeq2. Principal component analysis was performed after variance stabilizing transformation on the 500 most variable genes across all conditions of a given experiment. GO-term enrichment analysis was performed on genes significantly and strongly downregulated (FDR ≤ 0.1, log2FC ≤ -1) in SLAM-seq upon IAA-treatment in K562MYC-AID + Tir1 by the PANTHER Overrepresentation Test (Fisher's Exact with FDR multiple test correction, release 20171205, http://pantherdb.org) on GO Ontology database Released 2017-12-27.
scaling="ERCCtrial"
Mybi = [1,1,1,1,1,1,1]
tccountsMybi= tccounts[tccounts.columns[16:]]
deseqMybi = pyDESeq2.pyDESeq2(count_matrix=tccountsMybi[tccountsMybi.columns[np.array(Mybi+[1],np.bool)]], design_matrix=designMybi24[np.array(Mybi,np.bool)],
design_formula="~DMSO - VHL",
gene_column="genes")
MEF2D2 = [1,1,1,1,1,1,1,1,0,0,0,0,0,0,0,0]
tccountsMEF2D= tccounts[list(tccounts.columns[:16])+list(tccounts.columns[-1:])]
deseqMEF2D2 = pyDESeq2.pyDESeq2(count_matrix = tccountsMEF2D[tccountsMEF2D.columns[np.array(MEF2D2+[1],np.bool)]],
design_matrix=designMEF2D2[np.array(MEF2D2,np.bool)],
design_formula="~DMSO - VHL",
gene_column="genes")
MEF2D24 = [0,0,0,0,0,0,0,0,1,1,1,1,1,1,1,1]
deseqMEF2D24 = pyDESeq2.pyDESeq2(count_matrix = tccountsMEF2D[tccountsMEF2D.columns[np.array(MEF2D24+[1],np.bool)]],
design_matrix=designMEF2D24[np.array(MEF2D24,np.bool)],
design_formula="~DMSO - VHL",
gene_column="genes")
readcountsMyci= readcounts[readcounts.columns[15:-1]]
deseqMS2.run_estimate_size_factors(geoMeans = np.exp(np.mean(np.log(
readcountsMyci[readcountsMyci.columns[np.array([1,1,1,0,0,0,0,0,0,0,0,0], np.bool)]].values+1), 1)))
readcountsMybi= readcounts[readcounts.columns[:15]]
deseqMybi30.run_estimate_size_factors(geoMeans = np.exp(np.mean(np.log(readcountsMybi[readcountsMybi.columns[
np.array([1,1,1,1,0,0,0,0,0,0,0,0,0,0,0],np.bool)]].values+1),1)))
deseqMybi6.run_estimate_size_factors(geoMeans = np.exp(np.mean(np.log(readcountsMybi[readcountsMybi.columns[
np.array([0,0,0,0,0,0,0,0,1,1,1,1,0,0,0],np.bool)]].values+1),1)))
readcountsMEF2D= readcounts[readcounts.columns[:16]]
deseqMEF2D2.run_estimate_size_factors(geoMeans = np.exp(np.mean(np.log(readcountsMEF2D[readcountsMEF2D.columns[
np.array([1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0],np.bool)]].values+1),1)))
deseqMEF2D24.run_estimate_size_factors(geoMeans = np.exp(np.mean(np.log(readcountsMEF2D[readcountsMEF2D.columns[
np.array([0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0],np.bool)]].values+1),1)))
readcountsMybi= readcounts[readcounts.columns[16:-1]]
deseqMybi.run_estimate_size_factors(geoMeans = np.exp(np.mean(np.log(
readcountsMybi[readcountsMybi.columns[np.array([1,1,1,1,0,0,0], np.bool)]].values+1), 1)))
# from https://www.cell.com/trends/genetics/pdf/S0168-9525(13)00089-9.pdf FFROM THOUSANDS OF SAMPLES
housekeeping1 = ["C1orf43", "CHMP2A", "EMC7", "GPI", "PSMB2", "PSMB4", "RAB7A", "REEP5", "SNRPD3", "VCP", "VPS29"]
#https://www.ncbi.nlm.nih.gov/pmc/articles/PMC4760967/ FOR CANCER CELL LINES
housekeeping2 = ['18S rRNA',
'ACTB',
'B2M',
'G6PD',
'GAPDH',
'GUSB',
'HMBS',
'HPRT1',
'PGK1',
'PPIA',
'RPL13a',
'SDHA',
'TBP',
'TUBB',
'YWHAZ']
housekeeping = readcounts.genes.isin(housekeeping2)
readcountsMybi= readcounts[readcounts.columns[16:-1]]
np.exp(np.mean(np.log(
readcountsMybi[readcountsMybi.columns[np.array([1,1,1,1,0,0,0], np.bool)]].values+1), 1))
readcountsMybi= readcounts[readcounts.columns[16:-1]]
deseqMybi.run_estimate_size_factors(geoMeans = np.exp(np.mean(np.log(
readcountsMybi[readcountsMybi.columns[np.array([1,1,1,1,0,0,0], np.bool)]].values+1), 1)), controlGenes=housekeeping)
readcountsMEF2D= readcounts[readcounts.columns[:16]]
deseqMEF2D2.run_estimate_size_factors(geoMeans = np.exp(np.mean(np.log(readcountsMEF2D[readcountsMEF2D.columns[
np.array([1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0],np.bool)]].values+1),1)), controlGenes=housekeeping)
deseqMEF2D24.run_estimate_size_factors(geoMeans = np.exp(np.mean(np.log(readcountsMEF2D[readcountsMEF2D.columns[
np.array([0,0,0,0,0,0,0,0,1,1,1,1,0,0,0,0],np.bool)]].values+1),1)), controlGenes=housekeeping)
sizeFact = deseqMybi.getSizeFactors()
sizeFact
r
sizeFact[4:] = sizeFact[4:]*(r['MYBi_24h'][0]/r['PBS_24h'][0])
deseqMybi.setSizeFactors(sizeFact)
sizeFact = deseqMEF2D24.getSizeFactors()
sizeFact[4:] = sizeFact[4:]*(r['VHL_24h'][0]/r['DMSO_24h'][0])
deseqMEF2D24.setSizeFactors(sizeFact)
deseqMybi.run_deseq()
deseqMybi.get_deseq_result()
resMybi = deseqMybi.deseq_result
resMybi.pvalue = np.nan_to_num(np.array(resMybi.pvalue), 1)
resMybi.log2FoldChange = np.nan_to_num(np.array(resMybi.log2FoldChange), 0)
resMybi.log2FoldChange = -resMybi.log2FoldChange
resMybi["gene_id"] = resMybi.genes
deseqMEF2D2.run_deseq()
deseqMEF2D24.run_deseq()
deseqMEF2D2.get_deseq_result()
deseqMEF2D24.get_deseq_result()
resMEF2D2 = deseqMEF2D2.deseq_result
resMEF2D24 = deseqMEF2D24.deseq_result
resMEF2D2.pvalue = np.nan_to_num(np.array(resMEF2D2.pvalue), 1)
resMEF2D2.log2FoldChange = np.nan_to_num(np.array(resMEF2D2.log2FoldChange), 0)
resMEF2D24.pvalue = np.nan_to_num(np.array(resMEF2D24.pvalue), 1)
resMEF2D24.log2FoldChange = np.nan_to_num(np.array(resMEF2D24.log2FoldChange), 0)
resMEF2D24.log2FoldChange = -resMEF2D24.log2FoldChange
resMEF2D2.log2FoldChange = -resMEF2D2.log2FoldChange
resMEF2D2["gene_id"] = resMEF2D2.genes
resMEF2D24["gene_id"] = resMEF2D24.genes
deseqMS2.run_deseq()
deseqJQ1.run_deseq()
deseqMS2_JQ1.run_deseq()
deseqMS2.get_deseq_result()
deseqJQ1.get_deseq_result()
deseqMS2_JQ1.get_deseq_result()
resMS2 = deseqMS2.deseq_result
resJQ1 = deseqJQ1.deseq_result
resMS2_JQ1 = deseqMS2_JQ1.deseq_result
resMS2.pvalue = np.nan_to_num(np.array(resMS2.pvalue), 1)
resMS2.log2FoldChange = np.nan_to_num(np.array(resMS2.log2FoldChange), 0)
resJQ1.pvalue = np.nan_to_num(np.array(resJQ1.pvalue), 1)
resJQ1.log2FoldChange = np.nan_to_num(np.array(resJQ1.log2FoldChange), 0)
resMS2_JQ1.pvalue = np.nan_to_num(np.array(resMS2_JQ1.pvalue), 1)
resMS2_JQ1.log2FoldChange = np.nan_to_num(np.array(resMS2_JQ1.log2FoldChange), 0)
resMS2.log2FoldChange = -resMS2.log2FoldChange
resJQ1.log2FoldChange = -resJQ1.log2FoldChange
resMS2_JQ1.log2FoldChange = -resMS2_JQ1.log2FoldChange
resMS2["gene_id"] = resMS2.genes
resJQ1["gene_id"] = resJQ1.genes
resMS2_JQ1["gene_id"] = resMS2_JQ1.genes
deseqMybi30.run_deseq()
deseqMybi6.run_deseq()
deseqMybi30.get_deseq_result()
deseqMybi6.get_deseq_result()
resMybi30 = deseqMybi30.deseq_result
resMybi6 = deseqMybi6.deseq_result
resMybi30.pvalue = np.nan_to_num(np.array(resMybi30.pvalue), 1)
resMybi30.log2FoldChange = np.nan_to_num(np.array(resMybi30.log2FoldChange), 0)
resMybi6.pvalue = np.nan_to_num(np.array(resMybi6.pvalue), 1)
resMybi6.log2FoldChange = np.nan_to_num(np.array(resMybi6.log2FoldChange), 0)
resMybi6.log2FoldChange = -resMybi6.log2FoldChange
resMybi30.log2FoldChange = -resMybi30.log2FoldChange
resMybi30["gene_id"] = resMybi30.genes
resMybi6["gene_id"] = resMybi6.genes
%matplotlib inline
res = resMybi30[resMybi30.baseMean>10]
res.baseMean= np.log2(1+res.baseMean)
res["type"] = ['ctf' if i else "other" for i in res.genes.isin(ctf)]
ax = sns.boxplot(data=res,x='type',y='log2FoldChange').set_title("Mybi at 30mn foldChange")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskersMybi_30mn_logfch.pdf")
ax =sns.boxplot(data=res,x='type',y='baseMean')
ax.set_title("Mybi 30mn baseMean")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_Mybi_30mn_baseMean.pdf")
res = resMybi6[resMybi6.baseMean>10]
res.baseMean= np.log2(1+res.baseMean)
res["type"] = ['ctf' if i else "other" for i in res.genes.isin(ctf)]
ax= sns.boxplot(data=res,x='type',y='log2FoldChange')
ax.set_title("Mybi at 6h foldchange")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_Mybi_6h_logfch.pdf")
ax = sns.boxplot(data=res,x='type',y='baseMean')
ax.set_title("Mybi at 6h baseMean")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_Mybi_6h_baseMean.pdf")
res = resMybi[resMybi.baseMean>10]
res.baseMean= np.log2(1+res.baseMean)
res["type"] = ['ctf' if i else "other" for i in res.genes.isin(ctf)]
ax= sns.boxplot(data=res,x='type',y='log2FoldChange')
ax.set_title("Mybi at 24h foldchange")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_Mybi_24h_logfch.pdf")
ax = sns.boxplot(data=res,x='type',y='baseMean')
ax.set_title("Mybi at 24h baseMean")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_Mybi_24h_baseMean.pdf")
res = resMEF2D2[resMEF2D2.baseMean>10]
res.baseMean= np.log2(1+res.baseMean)
res["type"] = ['ctf' if i else "other" for i in res.genes.isin(ctf)]
ax= sns.boxplot(data=res,x='type',y='log2FoldChange')
ax.set_title("MEF2D at 2h foldchange")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_MEF2D_2h_logfch.pdf")
ax = sns.boxplot(data=res,x='type',y='baseMean')
ax.set_title("MEF2D at 2h baseMean")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_MEF2D_2h_baseMean.pdf")
res = resMEF2D24[resMEF2D24.baseMean>10]
res.baseMean= np.log2(1+res.baseMean)
res["type"] = ['ctf' if i else "other" for i in res.genes.isin(ctf)]
ax= sns.boxplot(data=res,x='type',y='log2FoldChange')
ax.set_title("MEF2D at 24h foldchange")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_MEF2D_24h_logfch.pdf")
ax = sns.boxplot(data=res,x='type',y='baseMean')
ax.set_title("MEF2D at 24h baseMean")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_MEF2D_24h_baseMean.pdf")
res = resJQ1[resJQ1.baseMean>10]
res.baseMean= np.log2(1+res.baseMean)
res["type"] = ['ctf' if i else "other" for i in res.genes.isin(ctf)]
ax = sns.boxplot(data=res,x='type',y='log2FoldChange')
ax.set_title("JQ1 foldchange")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_JQ1_logfch.pdf")
ax =sns.boxplot(data=res,x='type',y='baseMean')
ax.set_title("JQ1 base")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_JQ1_baseMean.pdf")
res = resMS2[resMS2.baseMean>10]
res.baseMean= np.log2(1+res.baseMean)
res["type"] = ['ctf' if i else "other" for i in res.genes.isin(ctf)]
ax =sns.boxplot(data=res,x='type',y='log2FoldChange')
ax.set_title("MS2 foldChange")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_MS2_logfch.pdf")
ax = sns.boxplot(data=res,x='type',y='baseMean')
ax.set_title("MS2 Base")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_MS2_baseMean.pdf")
res = resMS2_JQ1[resMS2_JQ1.baseMean>10]
res.baseMean= np.log2(1+res.baseMean)
res["type"] = ['ctf' if i else "other" for i in res.genes.isin(ctf)]
ax = sns.boxplot(data=res,x='type',y='log2FoldChange')
ax.set_title("MS2+JQ1 foldChange")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_MS2_JQ1_logfch.pdf")
ax = sns.boxplot(data=res,x='type',y='baseMean')
ax.set_title("MS2+JQ1 Base")
ax.figure.savefig('../results/'+project+"/plots/"+version+"_"+scaling+"_whiskers_MS2_JQ1_baseMean.pdf")
mix = pd.DataFrame()
mix["gene_id"] = resMybi30["gene_id"]
mix['Mybi 30mn'] = resMybi30.log2FoldChange
mix['Mybi 6h'] = resMybi6.log2FoldChange
scatter(mix[['Mybi 30mn','Mybi 6h']].values[:12000],
mix['gene_id'].values.tolist()[:12000], radi= 0.06, alpha=0.3,
colors = [0 if i in ctf else 1 for i in mix['gene_id'].values.tolist()[:12000]],
xname="Mybi 30mn",
yname="Mybi 6h",
folder='../results/'+project+"/plots/"+version+"_"+scaling+"_",
title='Mybi 30mn vs 6h differences in logFoldChange')
resMS2.to_csv("../results/"+project+"/"+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)+'_MS2_deseq.csv')
resJQ1.to_csv("../results/"+project+"/"+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)+'_JQ1_deseq.csv')
resMS2_JQ1.to_csv("../results/"+project+"/"+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)+'_MS2_JQ1_deseq.csv')
resMybi30.to_csv("../results/"+project+"/"+version+'_'+scaling+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_Mybi_30um_deseq.csv')
resMybi6.to_csv("../results/"+project+"/"+version+'_'+scaling+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_Mybi_6h_deseq.csv')
resMybi.to_csv("../results/"+project+"/"+version+'_'+scaling+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_Mybi_24h_deseq.csv')
resMEF2D2.to_csv("../results/"+project+"/"+version+'_'+scaling+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_MEF2D_2h_deseq.csv')
resMEF2D24.to_csv("../results/"+project+"/"+version+'_'+scaling+'_'+str(minvar_toremove)+'_'+str(mincount_toremove)+'_MEF2D_24h_deseq.csv')
we can conclude that we get similar results to the slamseq myc paper although it seems that our values are a bit skewed toward higher expression than what is on the slamseq paper. It mightt be explained by the pseudo count of 1 that I did not set. Because I think it would highly bias the DESeq algorithm.
show(volcano(resMS2,tohighlight=ctf, searchbox=True, title='DESeq results of MV411 under MS2 in volcano plot', folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)))
show(volcano(resJQ1,tohighlight=ctf, searchbox=True, title='DESeq results of MV411 under JQ1 in volcano plot', folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)))
show(volcano(resMS2_JQ1,tohighlight=ctf, searchbox=True, title='DESeq results of MV411 under MS2 and JQ1 in volcano plot', folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)))
resMybi30[resMybi30.gene_id=="MYB"]
show(volcano(resMybi30,tohighlight=ctf, searchbox=True, title="Mybi at 30mn", folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove)))
resMybi6[resMybi6.gene_id=='MYB']
show(volcano(resMybi6,tohighlight=ctf, searchbox=True, title="Mybi at 6h", folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove),maxvalue=50))
show(volcano(resMybi,tohighlight=ctf, searchbox=True, title="Mybi at 24h", folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove),maxvalue=50))
show(volcano(resMEF2D2,tohighlight=ctf, searchbox=True, title="MEF2D at 2h", folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove),maxvalue=50))
show(volcano(resMEF2D24,tohighlight=ctf, searchbox=True, title="MEF2D at 24h", folder='../results/'+project+'/plots/'+version+'_'+scaling+"_"+str(minvar_toremove)+'_'+str(mincount_toremove),maxvalue=50))